program gen_diff
	gen diff_mortgage = abs(mortgage - loan_val)/loan_val
	gen diff_date = abs(dated - date)
end

program min_mortgage_distance
	bysort `1': egen mindiff_mortgage = min(diff_mortgage)
	bysort `1' (diff_mortgage): keep if diff_mortgage == mindiff_mortgage
	codebook schemeid
end

program min_date_distance
	bysort `1': egen mindiff_date = min(diff_date)
	bysort `1' (diff_date): keep if diff_date == mindiff_date
	codebook schemeid
end



*------------------------------------------------------------------------------*
* Joinby *
*------------------------------------------------------------------------------*


*============= MATCH 1: pcd purchaseprice lender_id ===================

use "${datadir}\HCA_full.dta", clear

joinby pcd purchaseprice lender_id using "${datadir}\HTB_full"	


*--- Delete duplicates in schemeid
gen_diff
min_mortgage_distance schemeid
min_date_distance schemeid
bysort schemeid (psd_id): keep if _n == 1	// arbitrary but replicable (no other obvious sorting that would be better)

*--- Delete duplicates in psd_id
drop min* 

min_mortgage_distance psd_id
min_date_distance psd_id
bysort psd_id (schemeid): keep if _n == 1	



*--- Drop implausible matches
drop if diff_mortgage > .1 // 615 obs (less than 1%)
drop if diff_date > 90 // 424 obs (less than 1%)

gen match_quality = 1

so _all
save "${datadir}\matched", replace


*============= MATCH 2: postcode3 purchaseprice lender_id ============

*--- Preparation
// relevant PSD data
use "${datadir}\HTB_full", clear

merge 1:1 psd_id using "${datadir}\matched", ///
	keepusing(psd_id) nogen keep(1)

save "${datadir}\HTB_full_after_step1_match", replace


// relevant HCA data
use "${datadir}\HCA_full.dta", clear

merge 1:1 schemeid using "${datadir}\matched", ///
	keepusing(schemeid) nogen keep(1)	// 32,094
ren pdistrict postcode3
joinby postcode3 purchaseprice lender_id using ///
	"${datadir}\HTB_full_after_step1_match"


*--- Delete duplicates in schemeid
gen_diff	
min_mortgage_distance schemeid
min_date_distance schemeid
bysort schemeid (psd_id): keep if _n == 1	

*--- Delete duplicates in psd_id
drop min* 
min_mortgage_distance psd_id
min_date_distance psd_id
bysort psd_id (schemeid): keep if _n == 1	


*--- Step 4: drop implausible matches
sum diff_mortgage, det
drop if diff_mortgage > .1 // 1/6 obs

sum diff_date, det
drop if diff_date > 90 // 1/5 obs

gen match_quality = 2
save "${datadir}\matched2", replace




*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*
* Create dataset for analysis *
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

use "${datadir}\HTB_full", clear // 2,170,761 obs, saved on 4/12/2018


merge 1:1 psd_id using "${datadir}\matched", ///
	keepusing(mortgage equity deposit schemeid ///
	match_quality) nogen

merge 1:1 psd_id using "${datadir}\matched2", ///
	keepusing(mortgage equity deposit schemeid match_quality) nogen update
	
	
gen htb_dummy = (schemeid !=.)


*--- Fixes and variables
gen loan_final 	= cond(htb_dummy==1,mortgage,loan_val)
gen ltv_final 	= loan_final / purchaseprice * 100

gen 		interest00 		= interest/100
gen 		payment			= loan_val * ( interest00/12 * (1+interest00/12)^(mortgage_term * 12) ) / ((1+interest00/12)^(mortgage_term * 12) - 1) 

*--- RESTRICTION 1: only new or htb

preserve
	keep if new == 0 & htb_dummy == 0
	save "${datadir}\HTB_main_bunching600old", replace
restore

keep if new==1 | htb_dummy == 1


*--- RESTRICTION 2: only properly val <=1,000,000
keep if property_val <= 1000000	// 7 HTB eliminated, probably wrong matches (values > 1m)
keep if purchaseprice <= 1000000


*--- RESTRICTION 3: drop if gov scheme in PSD and not matched 
drop if gov & !htb_dummy


*--- Highlight London HTB
gen htb_london = region == 5 & date >= mdy(2, 1, 2016) 



*--- Prepare house price data 
// Match the local authorities
merge m:1 pcd using "${datadir}\la_region_postcode_lookup", keep(1 3) nogen
ren localauthoritycode areacode 


// Appreciation all
gen datem = mofd(date) + 24
merge m:1 datem areacode using "${datadir}\local_price_appreciation", ///
	keep(1 3) nogen keepusing(appreciation2)
rename datem datem2y
rename appreciation2 fw_appreciation2
gen datem = datem2y - 24
	
	
merge m:1 datem areacode using /// regionname
	"${datadir}\local_price_appreciation", ///
	keep(1 3) nogen keepusing(appreciation2 appreciation1)
rename appreciation2 bw_appreciation2
rename appreciation1 bw_appreciation1

	
// Appreciation new
gen dateq = qofd(date) + 8
ren areacode laua
merge m:1 dateq laua using ///
	"${datadir}\la_prevnew_appreciation", keep(1 3) 
ren dateq dateq2y
gen dateq = dateq - 8


*--- Variables 
gen deposit_final 		= deposit
replace deposit_final 	= purchaseprice - loan_final if missing(deposit_final)
gen DTV_final 			= deposit_final / purchaseprice  * 100

gen combined_LTI = cond(htb_dummy, LTI + equity/gross_income, LTI)
gen combined_LTV = cond(htb_dummy, LTV + 100 * equity/purchaseprice, LTV)
		
gen 		PTI 			= payment / (gross_income/12) * 100
gen 		DTI_final		= deposit_final / gross_income


tab advtype, gen(advtype_)
tab employment, gen(employment_)

replace gross_income 	= gross_income/1000
replace purchaseprice 	= purchaseprice/1000
replace property_val 	= property_val/1000
replace deposit_final 	= deposit_final/1000
replace loan_val 		= loan_val/1000
replace loan_final 		= loan_final/1000
replace payment 		= payment/1000	

replace equity 			= 0	if !htb_dummy 
replace equity 			= equity/1000

gen fix2 = ratetype == 1 & dealtype == 2
gen fix_other = ratetype == 1 & !fix2




* from gross to net income

* tab income_basis, nol // for 40% we don't know if single or joint income, suppose is joint

gen 	gross_income_single = .
replace gross_income_single = gross_income  	if income_basis ==3
replace gross_income_single = gross_income/2  	if income_basis ==2  
replace gross_income_single = gross_income  	if income_basis ==1 // this can be problematic as income basis in unknown
 
gen 	personal_allowance = .
replace personal_allowance = 9.940 		if inrange(date, mdy(4, 1, 2013), mdy(3, 31, 2014)) 
replace personal_allowance = 10.000 	if inrange(date, mdy(4, 1, 2014), mdy(3, 31, 2015)) 
replace personal_allowance = 10.600 	if inrange(date, mdy(4, 1, 2015), mdy(3, 31, 2016)) 
replace personal_allowance = 11.000 	if inrange(date, mdy(4, 1, 2016), mdy(3, 31, 2017)) 
replace personal_allowance = 11.500 	if inrange(date, mdy(4, 1, 2017), mdy(3, 31, 2018)) 


gen 	threshold_1 = .
replace threshold_1 = 32.010 	if inrange(date, mdy(4, 1, 2013), mdy(3, 31, 2014)) 
replace threshold_1 = 31.865 	if inrange(date, mdy(4, 1, 2014), mdy(3, 31, 2015)) 
replace threshold_1 = 31.785 	if inrange(date, mdy(4, 1, 2015), mdy(3, 31, 2016)) 
replace threshold_1 = 32.000 	if inrange(date, mdy(4, 1, 2016), mdy(3, 31, 2017)) 
replace threshold_1 = 33.500 	if inrange(date, mdy(4, 1, 2017), mdy(3, 31, 2018)) 


gen 	threshold_2 = .
replace threshold_2 = 150.000 	if inrange(date, mdy(4, 1, 2013), mdy(3, 31, 2014)) 
replace threshold_2 = 150.000 	if inrange(date, mdy(4, 1, 2014), mdy(3, 31, 2015)) 
replace threshold_2 = 150.000 	if inrange(date, mdy(4, 1, 2015), mdy(3, 31, 2016)) 
replace threshold_2 = 150.000 	if inrange(date, mdy(4, 1, 2016), mdy(3, 31, 2017)) 
replace threshold_2 = 150.000 	if inrange(date, mdy(4, 1, 2017), mdy(3, 31, 2018)) 


gen 	income_tax = .
replace income_tax = 0 													if gross_income_single < personal_allowance
replace income_tax = 0.20 * (gross_income_single - personal_allowance) 	if gross_income_single >= personal_allowance & gross_income_single < threshold_1
replace income_tax = 0.20 * (threshold_1 - personal_allowance) 	///
					+ 0.40 * (gross_income_single - threshold_1)  		if gross_income_single >= threshold_1 & gross_income_single < threshold_2
replace income_tax = 0.20 * (threshold_1 - personal_allowance) 	///
					+ 0.40 * (threshold_2 - threshold_1)  		///
					+ 0.45 * (gross_income_single - threshold_2) 		if gross_income_single >= threshold_2 


gen pt = .
replace pt = 7.748 	if inrange(date, mdy(4, 1, 2013), mdy(3, 31, 2014)) 
replace pt = 7.956 	if inrange(date, mdy(4, 1, 2014), mdy(3, 31, 2015)) 
replace pt = 8.060 	if inrange(date, mdy(4, 1, 2015), mdy(3, 31, 2016)) 
replace pt = 8.060 	if inrange(date, mdy(4, 1, 2016), mdy(3, 31, 2017)) 
replace pt = 8.164 	if inrange(date, mdy(4, 1, 2017), mdy(3, 31, 2018)) 
					
gen uel = .
replace uel = 41.444 	if inrange(date, mdy(4, 1, 2013), mdy(3, 31, 2014)) 
replace uel = 41.860 	if inrange(date, mdy(4, 1, 2014), mdy(3, 31, 2015)) 
replace uel = 42.380 	if inrange(date, mdy(4, 1, 2015), mdy(3, 31, 2016)) 
replace uel = 43.004 	if inrange(date, mdy(4, 1, 2016), mdy(3, 31, 2017)) 
replace uel = 45.032 	if inrange(date, mdy(4, 1, 2017), mdy(3, 31, 2018)) 
					
					
gen national_insurance = .
replace national_insurance = 0 										if gross_income_single < pt
replace national_insurance = 0.12 * (gross_income_single - pt) 		if gross_income_single >= pt & gross_income_single < uel
replace national_insurance = 0.12 * (uel - pt) 	///
							+ 0.02 * (gross_income_single - uel)	if gross_income_single >= uel


gen net_income_single 	= gross_income_single - income_tax - national_insurance

gen 	net_income 			= .
replace net_income 			= net_income_single 	if income_basis ==3
replace net_income 			= net_income_single * 2 if income_basis ==2
replace net_income 			= net_income_single 	if income_basis ==1 // this can be problematic as income basis in unknown

gen 	PTInet 				= payment / (net_income/12) * 100


*--- Broker
gen broker = strpos(saleschannelname, "Inter") > 0
tab broker if inrange(year, 2016, 2017) & htb_dummy == 0
tab broker if inrange(year, 2016, 2017) & htb_dummy == 1
tab broker if inrange(year, 2013, 2015) 

gen subsample = inrange(date, mdy(4, 2, 2013), mdy(4, 2, 2015)) & dealtype == 2 & LTV>20

save "${datadir}\HTB_main_1m", replace


keep if property_val <=600	
keep if purchaseprice <=600



/* Winsorize outliers */   
local var_out 	"loan_val interest LTI LTV" //  gross_income  
								
foreach var of local var_out {
	gen `var'_original = `var'
	sum `var', d
	scalar p1=r(p1)
	scalar p99=r(p99)
	replace `var' = p1 if `var' < p1
	replace `var' = p99 if `var' > p99 & `var' < .
}

save "${datadir}\HTB_main", replace



*--- Subsample for London HTB analysis
keep if inrange(date, mdy(8, 1, 2015), mdy(7, 31, 2016)) & htb_dummy
gen post = date > mdy(1, 31, 2016)
keep if region == 5 | region == 6
save "${datadir}\HTB_main_cleaned_feb2016", replace

use "${datadir}\HTB_main", replace
keep if inrange(date, mdy(8, 1, 2014), mdy(7, 31, 2015)) & htb_dummy
gen post = date > mdy(1, 31, 2015)
save "${datadir}\HTB_main_cleaned_feb2015_placebo", replace



*--- Subsample for repayment analysis
use "${datadir}\HTB_main", replace
keep if htb_dummy == 1
keep if inrange(date, mdy(4, 2, 2013), mdy(4, 2, 2015))
label data "Repayment subsample created by 2_run_HTB_mergedata.do"
save "${datadir}\HTB_repayment", replace

